Comparisons are made with the text features ignored; only the non-text features are used here.
import json
import pickle
# Standard plotly imports
import plotly.graph_objs as go
import plotly.plotly as py
from plotly.offline import iplot
from pathlib import Path
# Data science imports
import pandas as pd
import numpy as np
# Cufflinks wrapper on plotly
import cufflinks
# Configure cufflinks for offline plotting with a global dark theme.
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='solar', offline=True)

# Resolve project paths relative to the current working directory.
# Path.cwd() replaces the IPython-only `!pwd` shell magic, which is not
# valid Python outside a notebook cell.
project_dir = Path.cwd().resolve().parents[0]
features_path = project_dir / 'src' / 'features'
predictions_path = project_dir / 'reports'

# Load the training targets and the per-model training predictions.
# Context managers close the handles that the original bare open() calls leaked.
with open(features_path / 'training_y.pkl', 'rb') as f:
    Y_train = pickle.load(f)
with open(predictions_path / 'predictions_train.json') as f:
    predictions_train = json.load(f)
predictions_train contains results for the following models:
# Print an index/name listing of every model with training predictions.
for idx, model in enumerate(predictions_train):
    print('%d. %s' % (idx, model['name']))
The hyperparameters for the GD models above are:
# Bare expression: in a notebook cell this displays the hyperparameters of
# model no. 4 (one of the GD models listed above); it has no effect when the
# file is run as a plain script.
predictions_train[4]['hparams']
# ClosedForm_no_text_train and GradientDescent_no_text_train:
# select the no-text closed-form and gradient-descent training runs and pull
# out their per-example predictions. (The markdown heading above was fused
# onto the first code line in the original export; it is now this comment,
# which restores valid Python syntax.)
cf_train_no_text = predictions_train[3]
gd_train_no_text = predictions_train[7]
cf_prediction = cf_train_no_text['y_predicted']
gd_prediction = gd_train_no_text['y_predicted']
def _line_trace(ys, label, colour):
    """Thin connected line trace over example numbers 1..10000."""
    return go.Scatter(
        x=list(range(1, 10001)),
        y=ys,
        name=label,
        line=dict(color=colour, width=1),
        connectgaps=True,
    )

# The target is stored as an (n, 1) column; take the scalar out of each row.
target_values = [row[0] for row in Y_train.tolist()]

trace1 = _line_trace(target_values, 'Target', '#F9A746')
trace2 = _line_trace(cf_prediction, 'Closed form', '#E85285')
trace3 = _line_trace(gd_prediction, 'Gradient descent', '#6A1B9A')

data = [trace1, trace2, trace3]
layout = dict(
    title='Popularity Prediction on Training Data',
    xaxis=dict(title='Example no.'),
    yaxis=dict(title='Popularity score'),
    plot_bgcolor='#151516',
)
fig = dict(data=data, layout=layout)
iplot(fig)
#df = pd.read_json(json.dumps(predictions_train))
#df
Observations:
# Per-example squared error for each model: reshape the flat prediction
# lists into (n, 1) columns so they subtract element-wise from the target column.
cf_error = (np.asarray(cf_prediction).reshape(-1, 1) - Y_train) ** 2
gd_error = (np.asarray(gd_prediction).reshape(-1, 1) - Y_train) ** 2
def _error_trace(errors, label, colour):
    """Half-width line trace of per-example squared errors (an (n, 1) array)."""
    return go.Scatter(
        x=list(range(1, 10001)),
        y=[row[0] for row in errors.tolist()],
        name=label,
        line=dict(color=colour, width=0.5),
        connectgaps=True,
    )

trace1 = _error_trace(cf_error, 'Closed form', '#E85285')
trace2 = _error_trace(gd_error, 'Gradient descent', '#6A1B9A')

data = [trace1, trace2]
layout = dict(
    title='Squared Error for Each Training Example',
    xaxis=dict(title='Example no.'),
    yaxis=dict(title='Squared error'),
)
fig = dict(data=data, layout=layout)
iplot(fig)
Not much to say about this chart that hasn't been said. It was plotly practice, I guess.
# Load the per-model validation predictions and list the available models.
# The context manager closes the handle that the original bare open() leaked.
with open(predictions_path / 'predictions_validate.json') as f:
    predictions_validate = json.load(f)
for i, p in enumerate(predictions_validate):
    print('%d. %s' % (i, p['name']))

# No-text closed-form and gradient-descent validation runs and their predictions.
cf_validate_no_text = predictions_validate[3]
gd_validate_no_text = predictions_validate[7]
cf_prediction = cf_validate_no_text['y_predicted']
gd_prediction = gd_validate_no_text['y_predicted']
# Plot target vs. no-text model predictions on the validation examples.
# NOTE(review): the 'Target' trace plots Y_train — the 10,000 training
# targets — while the x-axis and the prediction traces cover only 1..1000
# validation examples. The validation targets were presumably intended here;
# confirm and load the validation y instead.
trace1 = go.Scatter(
x=[i for i in range(1, 1001)],
y=[y for [y] in Y_train.tolist()],
name = 'Target',
line = dict(
color='#F9A746',
width=1),
connectgaps=True
)
# Closed-form (no-text) validation predictions.
trace2 = go.Scatter(
x=[i for i in range(1, 1001)],
y=cf_prediction,
name = 'Closed form',
line = dict(
color='#E85285',
width=1),
connectgaps=True
)
# Gradient-descent (no-text) validation predictions.
trace3 = go.Scatter(
x=[i for i in range(1, 1001)],
y=gd_prediction,
name = 'Gradient descent',
line = dict(
color='#6A1B9A',
width=1),
connectgaps=True
)
data = [trace1, trace2, trace3]
# Dark plot background, as in the training chart above.
layout = dict(title='Popularity Prediction on Validation Data',
xaxis=dict(title='Example no.'),
yaxis=dict(title='Popularity score'),
plot_bgcolor = '#151516')
fig = dict(data=data, layout=layout)
iplot(fig)
Many popularity peaks are not captured by the model, which reinforces the underfitting hypothesis
# Summary table: MSE, hyperparameters, and iteration counts for the no-text
# closed-form and gradient-descent runs on the training and validation sets.
header_labels = ['<b>Set</b>',
                 '<b>Model</b>',
                 '<b>MSE</b>',
                 '<b>Hyperparameters</b>',
                 '<b>Iterations</b>']
columns = [
    ['Training', 'Training', 'Validation', 'Validation'],
    ['Closed Form', 'Gradient Descent', 'Closed Form', 'Gradient Descent'],
    [cf_train_no_text['mse'], gd_train_no_text['mse'],
     cf_validate_no_text['mse'], gd_validate_no_text['mse']],
    # Closed form has no hyperparameters or iteration count, hence the blanks.
    ['', str(gd_train_no_text['hparams']), '', str(gd_validate_no_text['hparams'])],
    ['', gd_train_no_text['num_iterations'], '', gd_validate_no_text['num_iterations']],
]
trace = go.Table(header=dict(values=header_labels), cells=dict(values=columns))
layout = dict(title='Summary')
data = [trace]
iplot(dict(data=data, layout=layout))
The summary table seems to suggest there is no underfitting, as the models performed better on the validation set than on the training set. However, this may simply be due to random sampling and noise.
The CF's weight vector is computed from a fixed formula - it will therefore give the same prediction for the same inputs.
On the other hand, GD's prediction depends on the hyperparameters set at training stage - different GD models will thus yield different predictions even for the same inputs.
This means GD's predictions are less stable — and less reproducible — than CF's.
Comparisons will be made using the closed form model.
# Closed-form runs trained with the top-160 and top-60 word features.
cf_train_160, cf_train_60 = predictions_train[1], predictions_train[2]
cf_validate_160, cf_validate_60 = predictions_validate[1], predictions_validate[2]

# MSE comparison table: no-text vs. top-60 vs. top-160 feature sets.
trace = go.Table(
    header=dict(values=['',
                        '<b>No Text</b>',
                        '<b>Top 60</b>',
                        '<b>Top 160</b>']),
    cells=dict(values=[
        ['<b>Training</b>', '<b>Validation</b>'],
        [cf_train_no_text['mse'], cf_validate_no_text['mse']],
        [cf_train_60['mse'], cf_validate_60['mse']],
        [cf_train_160['mse'], cf_validate_160['mse']],
    ]),
)
layout = dict(title='Closed Form MSE')
data = [trace]
iplot(dict(data=data, layout=layout))
# Overlay closed-form predictions from the three feature sets on the
# validation examples.
cf_y_no_text = cf_validate_no_text['y_predicted']
cf_y_60 = cf_validate_60['y_predicted']
cf_y_160 = cf_validate_160['y_predicted']
# NOTE(review): as in the previous chart, the 'Target' trace plots Y_train
# (the 10,000 training targets) against a 1..1000 x-range — presumably the
# validation targets were intended; confirm before trusting this figure.
trace1 = go.Scatter(
x=[i for i in range(1, 1001)],
y=[y for [y] in Y_train.tolist()],
name = '(white) Target',
line = dict(
color='white',
width=1),
connectgaps=True
)
# Predictions without any text features.
trace2 = go.Scatter(
x=[i for i in range(1, 1001)],
y=cf_y_no_text,
name = 'No Text',
line = dict(
color='#F9A746',
width=1),
connectgaps=True
)
# Predictions with the top-60 word features.
trace3 = go.Scatter(
x=[i for i in range(1, 1001)],
y=cf_y_60,
name = 'Top 60',
line = dict(
color='#E85285',
width=1),
connectgaps=True
)
# Predictions with the top-160 word features.
trace4 = go.Scatter(
x=[i for i in range(1, 1001)],
y=cf_y_160,
name = 'Top 160',
line = dict(
color='#6A1B9A',
width=1),
connectgaps=True
)
data = [trace1, trace2, trace3, trace4]
layout = dict(title='Closed Form Predictions on Validation Data',
xaxis=dict(title='Example no.'),
yaxis=dict(title='Popularity score'),
plot_bgcolor = '#151516'
)
fig = dict(data=data, layout=layout)
iplot(fig)
Judging from the MSEs, top-60 is an incremental improvement over no text, and top-160 is an incremental improvement over top-60.
Even though top-160's validation MSE is the only one higher than its training's, it's a small difference (1.0477763217987115 and 1.0686390774956736).
The fact that top-160's validation MSE is lower than both top-60's and no text's demonstrates that top-160 gives the best predictions out of the three.
Adding the top 160 words features overall improves our model.
# Compare the top-160 model against the full model (top 160, length and
# stemming features).
cf_train = predictions_train[0]
cf_validate = predictions_validate[0]

trace = go.Table(
    header=dict(values=['',
                        '<b>Top 160</b>',
                        '<b>Full</b> (top 160, length and stemming)']),
    cells=dict(values=[
        ['<b>Training</b>', '<b>Validation</b>'],
        [cf_train_160['mse'], cf_validate_160['mse']],
        [cf_train['mse'], cf_validate['mse']],
    ]),
)
layout = dict(title='Closed Form MSE')
data = [trace]
iplot(dict(data=data, layout=layout))
# Load the per-model test predictions and list the available models.
# The context manager closes the handle that the original bare open() leaked.
with open(predictions_path / 'predictions_test.json') as f:
    predictions_test = json.load(f)
for i, p in enumerate(predictions_test):
    print('%d. %s' % (i, p['name']))

cf_test_160 = predictions_test[1]
cf_test = predictions_test[0]

# MSE table across all three splits for the top-160 and full models.
trace = go.Table(
    header=dict(values=['',
                        '<b>Top 160</b>',
                        '<b>Full</b> (top 160, length and stemming)']),
    cells=dict(values=[
        ['<b>Training</b>', '<b>Validation</b>', '<b>Test</b>'],
        [cf_train_160['mse'], cf_validate_160['mse'], cf_test_160['mse']],
        [cf_train['mse'], cf_validate['mse'], cf_test['mse']],
    ]),
)
layout = dict(title='Closed Form MSE')
data = [trace]
iplot(dict(data=data, layout=layout))
# Overlay top-160 and full-model closed-form predictions on the test examples.
cf_y_160 = cf_test_160['y_predicted']
cf_y = cf_test['y_predicted']
# NOTE(review): the 'Target' trace plots Y_train (the 10,000 training
# targets) against a 1..1000 x-range — the test targets were presumably
# intended; verify before drawing conclusions from this figure.
trace1 = go.Scatter(
x=[i for i in range(1, 1001)],
y=[y for [y] in Y_train.tolist()],
name = 'Target',
line = dict(
color='#F9A746',
width=1),
connectgaps=True
)
# Full model (top 160 + length + stemming) test predictions.
trace2 = go.Scatter(
x=[i for i in range(1, 1001)],
y=cf_y,
name = 'Full',
line = dict(
color='#E85285',
width=1),
connectgaps=True
)
# Top-160 model test predictions.
trace3 = go.Scatter(
x=[i for i in range(1, 1001)],
y=cf_y_160,
name = 'Top 160',
line = dict(
color='#6A1B9A',
width=1),
connectgaps=True
)
data = [trace1, trace2, trace3]
layout = dict(title='Closed Form Predictions on Test Data',
xaxis=dict(title='Example no.'),
yaxis=dict(title='Popularity score'),
plot_bgcolor = '#151516'
)
fig = dict(data=data, layout=layout)
iplot(fig)
The full model (2 new features) only outperforms the top-160 model on the training set, and is worse on both the validation and test sets.
Our best model overfits and I'm sad.